Example: Monthly electricity sales for Virginia
Briefly characterize the dataset
library(arrow)
esales <- read_feather("esales.feather")
print(esales) # print the data as a table
summary(esales) # compute basic summary statistics about the data
value date year month
Min. : 7153 Min. :2001-01-01 Min. :2001 Min. : 1.000
1st Qu.: 8200 1st Qu.:2005-11-01 1st Qu.:2005 1st Qu.: 3.000
Median : 9019 Median :2010-09-01 Median :2010 Median : 6.000
Mean : 9093 Mean :2010-08-31 Mean :2010 Mean : 6.425
3rd Qu.: 9885 3rd Qu.:2015-07-01 3rd Qu.:2015 3rd Qu.: 9.000
Max. :11724 Max. :2020-05-01 Max. :2020 Max. :12.000
boxplot(esales)

Plot the time series

# library(lubridate) # Make it easy to deal with dates
esales %>% filter(month==3) # These three lines of code
esales %>% filter(month(date)==3) # all do
esales %>% filter(lubridate::month(date)==3) # the same thing.
# We don't have to keep the 'year' and 'month' column: can recover them if needed
esales %>%
select(date, sales_GWh = value) -> esales_tbl
print(esales_tbl)
Convert the data frame into a time series tsibble object
Make some plots
Make a histogram of monthly sales
hist(elsales_tbl_ts$sales_GWh, breaks=40)

Make a seasonal plot
# This plot won't work. Why not?
elsales_tbl_ts %>%
feasts::gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
autoplot(vaelsales_tbl_ts, sales_GWh) +
ylab("Virginia monthly total electricity sales (GWh)") +
xlab("")

Seasonal plots and seasonal subseries plots
vaelsales_tbl_ts %>% gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")

vaelsales_tbl_ts %>%
gg_subseries(sales_GWh) +
labs(
y = "Sales (GWh)",
title = "Seasonal subseries plot: Virginia electricity sales"
)

Scatterplots
Investigating relationships between two variables. Scatterplots. Correlation. Scatterplot matrices.
Readings: FPP Sect. 2.6
vic_elec
summary(vic_elec)
Time Demand Temperature Date Holiday
Min. :2012-01-01 00:00:00 Min. :2858 Min. : 1.50 Min. :2012-01-01 Mode :logical
1st Qu.:2012-09-30 22:52:30 1st Qu.:3969 1st Qu.:12.30 1st Qu.:2012-09-30 FALSE:51120
Median :2013-07-01 22:45:00 Median :4635 Median :15.40 Median :2013-07-01 TRUE :1488
Mean :2013-07-01 22:45:00 Mean :4665 Mean :16.27 Mean :2013-07-01
3rd Qu.:2014-04-01 23:37:30 3rd Qu.:5244 3rd Qu.:19.40 3rd Qu.:2014-04-01
Max. :2014-12-31 23:30:00 Max. :9345 Max. :43.20 Max. :2014-12-31
vic_elec %>%
filter(year(Time) == 2013) %>%
autoplot(Demand) +
labs(
y = "Demand (GW)",
title = "Half-hourly electricity demand: Victoria"
)

vic_elec %>%
filter(year(Time) == 2013) %>%
autoplot(Temperature) +
labs(
y = "Temperature (degrees Celsius)",
title = "Half-hourly temperatures: Melbourne, Australia"
)

vic_elec %>%
filter(year(Time) == 2013) %>%
ggplot(aes(x = Temperature, y = Demand)) +
# geom_density2d() +
geom_point(size=0.1, aes(colour=Holiday), alpha = 0.4) +
labs(y = "Demand (GW)", x = "Temperature (degrees Celsius)")

A Scatterplot matrix

Example: Australian production
# install.packages('tsibbledata')
library(tsibbledata)
aus_production
aus_production %>% gg_season(Electricity)

aus_production %>% gg_season(Beer)

Example: Gross Domestic Product data
Exploratory data analysis
library(tsibbledata) # Data sets package
print(global_economy)
global_economy %>% filter(Country=="Sweden") %>% print()
global_economy %>%
filter(Country=="Sweden") %>%
autoplot(GDP) +
ggtitle("GDP for Sweden") + ylab("$US billions")

Fitting data to simple models
global_economy %>% model(trend_model = TSLM(GDP ~ trend())) -> fit
7 errors (1 unique) encountered for trend_model
[7] 0 (non-NA) cases
fit
NA
NA
fit %>% filter(Country == "Sweden") %>% residuals()
fit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot(.resid)

Work with ln(GDP)
global_economy %>%
filter(Country=="Sweden") %>%
autoplot(log(GDP)) +
ggtitle("ln(GDP) for Sweden") + ylab("$US billions")

global_economy %>%
model(trend_model = TSLM(log(GDP) ~ trend())) -> logfit
7 errors (1 unique) encountered for trend_model
[7] 0 (non-NA) cases
logfit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()
Plot variable not specified, automatically selected `.vars = .resid`

global_economy %>% model(trend_model = TSLM(log(GDP) ~ log(Population))) -> fit3
7 errors (1 unique) encountered for trend_model
[7] 0 (non-NA) cases
fit3 %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()
Plot variable not specified, automatically selected `.vars = .resid`

Producing forecasts
fit %>% forecast(h = "3 years") -> fcast3yrs
fcast3yrs
NA
fcast3yrs %>% filter(Country == "Sweden", Year == 2020) %>% str()
fable [1 × 5] (S3: fbl_ts/tbl_ts/tbl_df/tbl/data.frame)
$ Country: Factor w/ 263 levels "Afghanistan",..: 232
$ .model : chr "trend_model"
$ Year : num 2020
$ GDP : dist [1:1]
..$ 3:List of 2
.. ..$ mu : num 5.45e+11
.. ..$ sigma: num 5.34e+10
.. ..- attr(*, "class")= chr [1:2] "dist_normal" "dist_default"
..@ vars: chr "GDP"
$ .mean : num 5.45e+11
- attr(*, "key")= tibble [1 × 3] (S3: tbl_df/tbl/data.frame)
..$ Country: Factor w/ 263 levels "Afghanistan",..: 232
..$ .model : chr "trend_model"
..$ .rows : list<int> [1:1]
.. ..$ : int 1
.. ..@ ptype: int(0)
..- attr(*, ".drop")= logi TRUE
- attr(*, "index")= chr "Year"
..- attr(*, "ordered")= logi TRUE
- attr(*, "index2")= chr "Year"
- attr(*, "interval")= interval [1:1] 1Y
..@ .regular: logi TRUE
- attr(*, "response")= chr "GDP"
- attr(*, "dist")= chr "GDP"
- attr(*, "model_cn")= chr ".model"
fcast3yrs %>%
filter(Country=="Sweden") %>%
autoplot(global_economy) +
ggtitle("GDP for Sweden") + ylab("$US billions")

Model residuals vs. forecast errors
Model residuals:
Your data: \(y_1, y_2, \ldots, y_T\)
Fitted values: \(\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_T\)
Model residuals: \(e_t = y_t - \hat{y}_t\)
Forecast errors:
augment(fit)
augment(fit) %>% filter(Country == "Sweden") %>%
ggplot(aes(x = .resid)) +
geom_histogram(bins = 20) +
ggtitle("Histogram of residuals")

Example: GDP, several countries
library(tsibbledata) # Data sets package
nordic <- c("Sweden", "Denmark", "Norway", "Finland")
(global_economy %>% filter(Country %in% nordic) -> nordic_economy)
NA
nordic_economy %>% autoplot(GDP)

fitnord <- nordic_economy %>%
model(
trend_model = TSLM(GDP ~ trend()),
trend_model_ln = TSLM(log(GDP) ~ trend()),
ets = ETS(GDP ~ trend("A")),
arima = ARIMA(GDP)
)
fitnord
fitnord %>%
select(arima) %>%
coef()
Denmark: ARMA(1,1)
Finland: MA(2)
Norway: MA(1)
Sweden: MA(2)
nordic_economy %>%
model(arima_constrained = ARIMA(GDP ~ pdq(1,0,2))) %>% select(arima_constrained) %>% coef()
It looks like you're trying to fully specify your ARIMA model but have not said if a constant should be included.
You can include a constant using `ARIMA(y~1)` to the formula or exclude it by adding `ARIMA(y~0)`.It looks like you're trying to fully specify your ARIMA model but have not said if a constant should be included.
You can include a constant using `ARIMA(y~1)` to the formula or exclude it by adding `ARIMA(y~0)`.It looks like you're trying to fully specify your ARIMA model but have not said if a constant should be included.
You can include a constant using `ARIMA(y~1)` to the formula or exclude it by adding `ARIMA(y~0)`.It looks like you're trying to fully specify your ARIMA model but have not said if a constant should be included.
You can include a constant using `ARIMA(y~1)` to the formula or exclude it by adding `ARIMA(y~0)`.4 errors (1 unique) encountered for arima_constrained
[4] Could not find an appropriate ARIMA model.
This is likely because automatic selection does not select models with characteristic roots that may be numerically unstable.
For more details, refer to https://otexts.com/fpp3/arima-r.html#plotting-the-characteristic-roots
fitnord %>% coef()
fitnord %>% glance()
fitnord %>% filter(Country == "Denmark") %>% select(arima) %>% report()
Series: GDP
Model: ARIMA(1,1,1)
Coefficients:
ar1 ma1
-0.3898 0.7240
s.e. 0.2061 0.1434
sigma^2 estimated as 2.407e+20: log likelihood=-1417.5
AIC=2840.99 AICc=2841.45 BIC=2847.12
fitnord %>%
accuracy() %>%
arrange(Country, MPE)
# ETS forecasts
USAccDeaths %>%
ets() %>%
forecast() %>%
autoplot()
str(taylor)
plot(taylor)
Plot lagged values
vaelsales_tbl_ts %>% filter(month(Month) %in% c(3,6,9,12)) %>% gg_lag(sales_GWh, lags = 1:2)
vaelsales_tbl_ts %>% filter(month(Month) == 1) %>% gg_lag(sales_GWh, lags = 1:2)
vaelsales_tbl_ts %>% ACF(sales_GWh) %>% autoplot()
# decompose(vaelsales_tbl_ts)
vaelsales_tbl_ts %>%
model(STL(sales_GWh ~ trend(window=21) + season(window='periodic'), robust = TRUE)) %>%
components() %>%
autoplot()
vaelsales_tbl_ts %>%
mutate(ln_sales_GWh = log(sales_GWh)) %>%
model(STL(ln_sales_GWh ~ trend(window=21) + season(window='periodic'),
robust = TRUE)) %>%
components() %>%
autoplot()
vaelsales_tbl_ts %>%
features(sales_GWh, feat_stl)
vaelsales_tbl_ts %>%
features(sales_GWh, feature_set(pkgs="feasts"))
References
---
title:     "Exploratory analysis of time series data: Examples"
institute: "SYS 5581 Time Series & Forecasting, Spring 2021" 
author:     "Instructor: Arthur Small"
date:       "Version of `r Sys.Date()`"
output: 
  html_notebook:
    number_sections: true
    toc: yes
    toc_depth: 4
    code_folding: show # options: show, hide
    fig_caption: yes
  # html_document:
  #       keep_md: yes
  # pdf_document: default
bibliography: /Users/Arthur/GitRepos/Teaching/time-series/tseries.bib
link-citations: yes
---

```{r set up coding environment, include=FALSE, message=FALSE}
# library(dplyr) -- don't need this if you are loading the entire 'tidyverse' suite
library(tidyverse)
library(lubridate) # For easy handling of time-indexed objects

# if(!('fpp3' %in% installed.packages())) install.packages('fpp3')
library(fpp3)
```


# Example: Monthly electricity sales for Virginia

## Extract data from remote database

```{r open connection to database, eval=FALSE, include=FALSE}
# Open connection to a remote database
# Make sure your VPN network connection is active if needed!

# if(!('RPostgreSQL' %in% installed.packages())) install.packages('RPostgreSQL')
library(RPostgreSQL)

# "my_postgres_credentials.R" contains the log-in information
source("/Users/Arthur/GitRepos/Teaching/my_postgres_db_credentials.R")

# Open connection
db_driver <- dbDriver("PostgreSQL")
db <- dbConnect(db_driver,user=user, password=password,dbname="postgres", host=host)
rm(password) 

# check the connection: If function returns value TRUE, the connection is working
dbExistsTable(db, "metadata")
```

```{r retrieve data from db, eval=FALSE, message=FALSE}
esales <- dbGetQuery(db,'SELECT * from eia_elec_sales_va_all_m') # SQL code to retrieve data from a table in the remote database
# str(esales)
esales <- as_tibble(esales) # Convert dataframe to a 'tibble' for tidyverse work
# str(esales)
```

```{r save data in Apache Arrow format, eval=FALSE}
# Reference: https://arrow.apache.org/docs/r/
# if(!('arrow' %in% installed.packages())) install.packages('arrow')
library(arrow)
write_feather(esales, "esales.feather")
```

```{r close db connection, eval=FALSE}
# Close connection -- this is good practice
dbDisconnect(db)
dbUnloadDriver(db_driver)
```

## Briefly characterize the dataset

```{r read in data}
library(arrow)
esales <- read_feather("esales.feather")
```

```{r }
print(esales)    # print the data as a table
summary(esales)  # compute basic summary statistics about the data
boxplot(esales)
```

```{r use tidyverse syntax to perform some simple data manipulations}
# References: https://www.tidyverse.org/, https://dplyr.tidyverse.org/
# filter(data object, condition) : syntax for filter() command

esales %>%
  filter(year == 2019) %>%
  filter(value > 9000) %>%
  print()

(esales %>%
  group_by(year) %>%
  summarise(Total = sum(value)) -> total_esales_by_year)

esales %>%
  mutate(sales_TWh = value/1000) %>%
  select(-value)
```

## Plot the time series

```{r use ggplot2 to generate a plot}
#Reference: https://ggplot2.tidyverse.org/

ggplot(data=esales, aes(x=date,y=value)) +
  geom_line() + xlab("Year") + ylab("Virginia monthly total electricity sales (GWh)")
```

```{r}
# library(lubridate) # Make it easy to deal with dates

esales %>% filter(month==3)                   # These three lines of code
esales %>% filter(month(date)==3)             #   all do
esales %>% filter(lubridate::month(date)==3)  #   the same thing.

# We don't have to keep the 'year' and 'month' column: can recover them if needed

esales %>%
  select(date, sales_GWh = value) -> esales_tbl

print(esales_tbl)
```

## Convert the data frame into a time series `tsibble` object


```{r}
# install.packages("tsibble")
library(tsibble) # Reference: https://tsibble.tidyverts.org/articles/intro-tsibble.html

esales_tbl %>% as_tsibble(index = date) -> elsales_tbl_ts

print(elsales_tbl_ts)
```

## Make some plots

###  Make a histogram of monthly sales

```{r make a histogram of the data}
hist(elsales_tbl_ts$sales_GWh, breaks=40)
```

###  Make a seasonal plot

```{r, eval=FALSE}
# This plot won't work. Why not?
elsales_tbl_ts %>% 
  feasts::gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
```

```{r Change the tsibble so index is monthly}
# install.packages("feasts"), Reference: https://feasts.tidyverts.org/
library(feasts)

elsales_tbl_ts %>% 
  mutate(Month = tsibble::yearmonth(date)) %>% 
  as_tsibble(index = Month) %>% 
  select(Month,sales_GWh) -> vaelsales_tbl_ts

print(vaelsales_tbl_ts)
```
```{r}
autoplot(vaelsales_tbl_ts, sales_GWh) + 
  ylab("Virginia monthly total electricity sales (GWh)") + 
  xlab("")  # Leave horiz. axis label blank
```
## Seasonal plots and seasonal subseries plots

```{r , warning=FALSE}
vaelsales_tbl_ts %>% gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
```


```{r}
vaelsales_tbl_ts %>% 
  gg_subseries(sales_GWh) +
  labs(
    y = "Sales (GWh)",
    title = "Seasonal subseries plot: Virginia electricity sales"
  )
```

## Scatterplots

Investigating relationships between two variables. Scatterplots. Correlation. Scatterplot matrices.

Readings: FPP Sect. 2.6

```{r}
vic_elec

summary(vic_elec)

vic_elec %>%
  filter(year(Time) == 2013) %>%
  autoplot(Demand) +
  labs(
    y = "Demand (GW)",
    title = "Half-hourly electricity demand: Victoria"
  )
```
```{r}
vic_elec %>%
  filter(year(Time) == 2013) %>%
  autoplot(Temperature) +
  labs(
    y = "Temperature (degrees Celsius)",
    title = "Half-hourly temperatures: Melbourne, Australia"
  )
```

```{r}
vic_elec %>%
  filter(year(Time) == 2013) %>%
  ggplot(aes(x = Temperature, y = Demand)) + 
#  geom_density2d() +
  geom_point(size=0.1, aes(colour=Holiday), alpha = 0.4) +
  labs(y = "Demand (GW)", x = "Temperature (degrees Celsius)")
```
A Scatterplot matrix

```{r, warning=FALSE}
vic_elec

boxplot(vic_elec$Temperature)

# install.packages("GGally")
vic_elec %>% 
  # mutate(Temperature = round(Temperature)) %>%
  # pivot_wider(values_from=c(Demand,Temperature), names_from=Holiday) %>%
  GGally::ggpairs(columns = 3:2)

vic_elec %>% 
  mutate(Year = factor(year(Date))) %>%
  select(-c(Date, Holiday)) %>%
  GGally::ggpairs(columns = 4:2)
```

# Example: Australian production

```{r, warning=FALSE}
# install.packages('tsibbledata')
library(tsibbledata)

aus_production

aus_production %>% gg_season(Electricity)

aus_production %>% gg_season(Beer)
```

# Example: Gross Domestic Product data

## Exploratory data analysis

```{r}
library(tsibbledata) # Data sets package

print(global_economy)
```

```{r}
global_economy %>% filter(Country=="Sweden") %>% print()
```

```{r}
global_economy %>%
  filter(Country=="Sweden") %>%
  autoplot(GDP) +
  ggtitle("GDP for Sweden") + ylab("$US billions")
```

## Fitting data to simple models

```{r}
global_economy %>% model(trend_model = TSLM(GDP ~ trend())) -> fit

fit


```

```{r}
fit %>% filter(Country == "Sweden") %>% residuals()
```

```{r}

fit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot(.resid)
```

### Work with ln(GDP)

```{r}
global_economy %>%
  filter(Country=="Sweden") %>%
  autoplot(log(GDP)) +
  ggtitle("ln(GDP) for Sweden") + ylab("$US billions")
```

```{r}
global_economy %>%
  model(trend_model = TSLM(log(GDP) ~ trend())) -> logfit
```

```{r}
logfit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()
```

```{r}
global_economy %>% model(trend_model = TSLM(log(GDP) ~ log(Population))) -> fit3

fit3 %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()

```

# Producing forecasts

```{r}
fit %>% forecast(h = "3 years") -> fcast3yrs

fcast3yrs

```

```{r}

fcast3yrs %>% filter(Country == "Sweden", Year == 2020) %>% str()
```

```{r visualize forecasts}
fcast3yrs %>% 
  filter(Country=="Sweden") %>%
  autoplot(global_economy) +
  ggtitle("GDP for Sweden") + ylab("$US billions")
```

## Model residuals vs. forecast errors

Model residuals:

Your data: $y_1, y_2, \ldots, y_T$

Fitted values: $\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_T$

Model residuals: $e_t = y_t - \hat{y}_t$

Forecast errors:

```{r}
augment(fit)
```

```{r}
augment(fit) %>% filter(Country == "Sweden") %>%
  ggplot(aes(x = .resid)) +
  geom_histogram(bins = 20) +
  ggtitle("Histogram of residuals")
```

## Are the model residuals auto-correlated?

```{r}
augment(fit) %>% filter(Country == "Sweden") -> augSweden

augSweden %>%
  ACF(.resid) %>%
  autoplot() + ggtitle("ACF of residuals")
```

```{r}
augment(fit3) %>% filter(Country == "Sweden") -> augSweden3

augSweden3 %>%
  ACF(.resid) %>%
  autoplot() + ggtitle("ACF of residuals")
```


# Example: GDP, several countries


```{r}
library(tsibbledata) # Data sets package

nordic <- c("Sweden", "Denmark", "Norway", "Finland")

(global_economy %>% filter(Country %in% nordic) -> nordic_economy)

```

```{r}
nordic_economy %>% autoplot(GDP)
```

```{r}
fitnord <- nordic_economy %>%
  model(
    trend_model = TSLM(GDP ~ trend()),
    trend_model_ln = TSLM(log(GDP) ~ trend()),
    ets = ETS(GDP ~ trend("A")),
    arima = ARIMA(GDP)
  )

fitnord
```

```{r}
fitnord %>%
  select(arima) %>%
  coef()
```


Denmark: ARMA(1,1)

Finland: MA(2)

Norway: MA(1)

Sweden: MA(2)

```{r}
nordic_economy %>%
  model(arima_constrained = ARIMA(GDP ~ pdq(1,0,2))) %>% select(arima_constrained) %>% coef()
```

```{r}
fitnord %>% coef() 
```

```{r}
fitnord %>%  glance()  
```

```{r}
fitnord %>% filter(Country == "Denmark") %>% select(arima) %>% report()
```

```{r}
fitnord %>%
  accuracy() %>%
  arrange(Country, MPE)
```



```{r, eval=FALSE}
# ETS forecasts
USAccDeaths %>%
  ets() %>%
  forecast() %>%
  autoplot()
```

```{r, eval=FALSE}
str(taylor)
plot(taylor)
```

## Plot lagged values

```{r plot lagged values}
vaelsales_tbl_ts  %>% filter(month(Month) %in% c(3,6,9,12)) %>% gg_lag(sales_GWh, lags = 1:2)

vaelsales_tbl_ts  %>% filter(month(Month) == 1) %>% gg_lag(sales_GWh, lags = 1:2)
```

```{r}
vaelsales_tbl_ts %>% ACF(sales_GWh) %>% autoplot()
```

```{r perform automated time series decomposition}


# decompose(vaelsales_tbl_ts)
```

```{r perform additive STL decomposition of the VA electricity sales time series}
vaelsales_tbl_ts %>%
  model(STL(sales_GWh ~ trend(window=21) + season(window='periodic'), robust = TRUE)) %>%
  components() %>%
  autoplot()
```

```{r perform multiplicative STL decomposition of the VA electricity sales time series}
vaelsales_tbl_ts %>%
  mutate(ln_sales_GWh = log(sales_GWh)) %>%
  model(STL(ln_sales_GWh ~ trend(window=21) + season(window='periodic'),
    robust = TRUE)) %>%
  components() %>%
  autoplot()
```

```{r}
vaelsales_tbl_ts %>%
  features(sales_GWh, feat_stl)
```

```{r}
vaelsales_tbl_ts %>%
  features(sales_GWh, feature_set(pkgs="feasts"))
```



# References
